House Prices

Author

Haomin Zhang

Advanced Regression Techniques

Predict sales prices and practice feature engineering, random forests (RFs), and gradient boosting.

Description

Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

Challenge

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, predict the final price of each home.

Set Environment

library(tidyverse)
library(tidymodels)
library(conflicted)
conflicts_prefer(dplyr::filter())
conflicts_prefer(dplyr::lag())
conflicts_prefer(recipes::step())

Read Data

# Load the training and test CSV files, then stack them into a single data
# frame. The `From` column ("train" / "test") records each row's source file.
house_prices_train <- read.csv("./house_prices_train.csv")
house_prices_test <- read.csv("./house_prices_test.csv")
house_prices_data <- list(
  "train" = house_prices_train,
  "test" = house_prices_test
) |>
  bind_rows(.id = "From")
glimpse(house_prices_data)
Rows: 2,919
Columns: 82
$ From          <chr> "train", "train", "train", "train", "train", "train", "t…
$ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
$ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
$ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
$ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
$ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
$ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
$ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
$ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
$ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
$ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
$ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
$ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
$ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
$ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
$ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
$ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
$ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
$ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
$ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
$ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
$ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
$ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
$ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
$ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
$ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
$ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
$ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
$ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
$ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
$ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
$ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
$ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
$ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
$ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
$ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
$ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
$ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
$ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
$ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
$ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
$ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
$ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
$ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
$ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
$ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
$ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
$ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
$ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
$ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
$ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
$ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
$ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
$ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
$ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
$ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
$ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
$ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
$ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
$ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
$ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
$ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
$ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
$ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
$ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
$ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
$ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
$ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
$ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Count NA

# Count missing values in every column except SalePrice and list only the
# columns that have at least one NA (one row per column).
house_prices_data |>
  select(!SalePrice) |>
  summarise(across(everything(), \(col) sum(is.na(col)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "columns",
    values_to = "na_count"
  ) |>
  filter(na_count > 0)
# A tibble: 34 × 2
   columns     na_count
   <chr>          <int>
 1 MSZoning           4
 2 LotFrontage      486
 3 Alley           2721
 4 Utilities          2
 5 Exterior1st        1
 6 Exterior2nd        1
 7 MasVnrType        24
 8 MasVnrArea        23
 9 BsmtQual          81
10 BsmtCond          82
# ℹ 24 more rows

Convert Data

# Score the shared Ex/Gd/TA/Fa/Po quality scale as 5..1.
# Missing codes become `na_value`: pass 0 where NA is to be scored as the
# lowest level, or leave the default to keep the value missing.
# Any other code yields NA.
score_quality <- function(x, na_value = NA_real_) {
  case_when(
    x == "Ex" ~ 5,
    x == "Gd" ~ 4,
    x == "TA" ~ 3,
    x == "Fa" ~ 2,
    x == "Po" ~ 1,
    is.na(x) ~ na_value
  )
}

# Score the basement finish type (GLQ best ... Unf worst) as 6..1; NA -> 0.
score_bsmt_finish <- function(x) {
  case_when(
    x == "GLQ" ~ 6,
    x == "ALQ" ~ 5,
    x == "BLQ" ~ 4,
    x == "Rec" ~ 3,
    x == "LwQ" ~ 2,
    x == "Unf" ~ 1,
    is.na(x) ~ 0
  )
}

# Bin an age in years into decade labels whose bounds are inclusive:
#   age <= 0 -> "0", 1..10 -> "1-10", 11..20 -> "11-20", ...
# Ages above `last_decade * 10` get `overflow`; a missing age stays NA.
# The previous hand-written chains tested `age / 10 < k`, which put ages of
# exactly 10, 20, ... into the *next* bin, contradicting the labels.
bin_age <- function(age, last_decade, overflow = NA_character_) {
  decade <- ceiling(age / 10)
  case_when(
    age <= 0 ~ "0",
    decade <= last_decade ~
      paste0((decade - 1) * 10 + 1, "-", decade * 10),
    !is.na(age) ~ overflow
  )
}

# Recode the raw columns: fill structural NAs, turn years into ages / age
# bins, map ordered quality codes to numeric scores, and convert the remaining
# character columns to factors.
house_prices_data <- house_prices_data |>
  mutate(
    MSSubClass = as.factor(MSSubClass),
    Alley = if_else(is.na(Alley), "None", Alley),
    # Age of the house at sale, floored at zero.
    YearBuilt = YrSold - YearBuilt,
    YearBuilt = if_else(YearBuilt < 0, 0, YearBuilt),
    # Years since remodelling at sale, binned by decade up to 60.
    YearRemodAdd = bin_age(YrSold - YearRemodAdd, last_decade = 6) |>
      as.factor(),
    # A second exterior material identical to the first carries no
    # information of its own, so it is recorded as "None".
    Exterior2nd = if_else(
      Exterior2nd == Exterior1st, "None", Exterior2nd
    ),
    # Type is set to "None" only when the area is missing as well; this must
    # run before the missing areas are zero-filled on the next line.
    MasVnrType = if_else(
      is.na(MasVnrType) & is.na(MasVnrArea),
      "None", MasVnrType
    ),
    MasVnrArea = if_else(is.na(MasVnrArea), 0, MasVnrArea),
    # Quality columns where a missing code is left missing.
    across(
      c(ExterQual, ExterCond, HeatingQC, KitchenQual),
      \(q) score_quality(q)
    ),
    # Quality columns where a missing code is scored 0.
    across(
      c(BsmtQual, BsmtCond, FireplaceQu, GarageQual, GarageCond),
      \(q) score_quality(q, na_value = 0)
    ),
    BsmtExposure = case_when(
      BsmtExposure == "Gd" ~ 4,
      BsmtExposure == "Av" ~ 3,
      BsmtExposure == "Mn" ~ 2,
      BsmtExposure == "No" ~ 1,
      is.na(BsmtExposure) ~ 0
    ),
    across(c(BsmtFinType1, BsmtFinType2), score_bsmt_finish),
    # Missing basement areas are zero-filled.
    across(
      c(BsmtFinSF1, BsmtFinSF2:TotalBsmtSF),
      \(area) if_else(is.na(area), 0, area)
    ),
    CentralAir = case_match(
      CentralAir,
      "Y" ~ 1,
      "N" ~ 0
    ),
    Functional = if_else(is.na(Functional), "Typ", Functional),
    # Garage age at sale, binned by decade; anything older than 100 years is
    # "100+". Must run while YrSold is still numeric.
    GarageYrBlt = bin_age(
      YrSold - GarageYrBlt,
      last_decade = 10, overflow = "100+"
    ),
    # Rows without a GarageType get "None" in GarageType, GarageYrBlt and
    # GarageFinish.
    across(
      c(GarageType:GarageFinish),
      \(col) if_else(is.na(GarageType), "None", col)
    ),
    GarageYrBlt = as.factor(GarageYrBlt) |>
      fct_relevel("100+", "None", after = Inf),
    # PoolQC uses a 4-level scale (no "Po" level is scored here).
    PoolQC = case_when(
      PoolQC == "Ex" ~ 4,
      PoolQC == "Gd" ~ 3,
      PoolQC == "TA" ~ 2,
      PoolQC == "Fa" ~ 1,
      is.na(PoolQC) ~ 0
    ),
    Fence = case_when(
      Fence == "GdPrv" ~ 4,
      Fence == "MnPrv" ~ 3,
      Fence == "GdWo" ~ 2,
      Fence == "MnWw" ~ 1,
      is.na(Fence) ~ 0
    ),
    # A feature worth nothing is recorded as "None"; a feature that has a
    # value but no label is recorded as "Gar2".
    MiscFeature = if_else(MiscVal == 0, "None", MiscFeature),
    MiscFeature = if_else(is.na(MiscFeature), "Gar2", MiscFeature),
    across(c(MoSold, YrSold), as.factor),
    # Remaining character columns become factors ordered by frequency.
    across(
      where(is.character),
      \(col) as.factor(col) |> fct_infreq()
    ),
    GarageType = GarageType |>
      fct_relevel("None", after = Inf)
  )
glimpse(house_prices_data)
Rows: 2,919
Columns: 82
$ From          <fct> train, train, train, train, train, train, train, train, …
$ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ MSSubClass    <fct> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
$ MSZoning      <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
$ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
$ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
$ Street        <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
$ Alley         <fct> None, None, None, None, None, None, None, None, None, No…
$ LotShape      <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
$ LandContour   <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
$ Utilities     <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
$ LotConfig     <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
$ LandSlope     <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
$ Neighborhood  <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
$ Condition1    <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
$ Condition2    <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
$ BldgType      <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
$ HouseStyle    <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
$ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
$ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
$ YearBuilt     <dbl> 5, 31, 7, 91, 8, 16, 3, 36, 77, 69, 43, 1, 46, 1, 48, 78…
$ YearRemodAdd  <fct> 1-10, 31-40, 1-10, 31-40, 1-10, 11-20, 1-10, 31-40, 51-6…
$ RoofStyle     <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
$ RoofMatl      <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
$ Exterior1st   <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
$ Exterior2nd   <fct> None, None, None, Wd Shng, None, None, None, None, Wd Sh…
$ MasVnrType    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
$ MasVnrArea    <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
$ ExterQual     <dbl> 4, 3, 4, 3, 4, 3, 4, 3, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 3,…
$ ExterCond     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ Foundation    <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
$ BsmtQual      <dbl> 4, 4, 4, 3, 4, 4, 5, 4, 3, 3, 3, 5, 3, 4, 3, 3, 3, 0, 3,…
$ BsmtCond      <dbl> 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3,…
$ BsmtExposure  <dbl> 1, 4, 2, 1, 3, 1, 3, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 1,…
$ BsmtFinType1  <dbl> 6, 5, 6, 5, 6, 6, 6, 5, 1, 6, 3, 6, 5, 1, 4, 1, 5, 0, 6,…
$ BsmtFinSF1    <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
$ BsmtFinType2  <dbl> 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,…
$ BsmtFinSF2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ BsmtUnfSF     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
$ TotalBsmtSF   <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
$ Heating       <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
$ HeatingQC     <dbl> 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5, 3, 5, 3, 5, 5, 3, 5,…
$ CentralAir    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Electrical    <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
$ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
$ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
$ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
$ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
$ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
$ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
$ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
$ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ KitchenQual   <dbl> 4, 3, 4, 4, 4, 3, 4, 3, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 4,…
$ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
$ Functional    <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
$ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
$ FireplaceQu   <dbl> 0, 3, 3, 4, 3, 0, 4, 3, 3, 3, 0, 4, 0, 4, 2, 0, 3, 0, 0,…
$ GarageType    <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
$ GarageYrBlt   <fct> 1-10, 31-40, 1-10, 1-10, 1-10, 11-20, 1-10, 31-40, 71-80…
$ GarageFinish  <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
$ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
$ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
$ GarageQual    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ GarageCond    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ PavedDrive    <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
$ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
$ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
$ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
$ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
$ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PoolQC        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Fence         <dbl> 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0,…
$ MiscFeature   <fct> None, None, None, None, None, Shed, None, Shed, None, No…
$ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
$ MoSold        <fct> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
$ YrSold        <fct> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
$ SaleType      <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
$ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
$ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Draw Plot

# Split the combined data back into its two sources using the `From` marker.
# Both sets drop the bookkeeping columns; the test set also drops SalePrice.
train_data <- filter(house_prices_data, From == "train") |>
  select(!c(From, Id))
test_data <- filter(house_prices_data, From == "test") |>
  select(!c(From, Id, SalePrice))
# Ids of the test rows, kept as a one-column data frame.
test_id <- filter(house_prices_data, From == "test") |>
  select(Id)
# Names of the predictor columns (everything in the test set).
columns <- colnames(test_data)
# Numeric columns that hold scores or small counts; they are plotted as
# discrete groups (bar + box plot) rather than as continuous variables.
sequences <- c(
  "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
  "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC",
  "CentralAir", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
  "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd",
  "Fireplaces", "FireplaceQu", "GarageCars", "GarageQual", "GarageCond",
  "PoolQC", "Fence"
)

# For every predictor draw its own distribution and its relation to SalePrice:
#   * factor / score columns -> bar chart of counts + box plot of SalePrice
#   * other numeric columns  -> histogram + scatter plot against SalePrice
for (x in columns) {
  # `||` (scalar, short-circuiting) is the correct operator inside `if`.
  if (is.factor(train_data[[x]]) || is.element(x, sequences)) {
    barPlot <- train_data |>
      ggplot(aes(x = as.factor(.data[[x]]))) +
      geom_bar(
        aes(y = after_stat(count)),
        color = "royalblue", fill = "skyblue"
      ) +
      xlab(x) +
      theme_bw()
    print(barPlot)
    boxPlot <- train_data |>
      ggplot(aes(y = SalePrice)) +
      geom_boxplot(
        aes(x = as.factor(.data[[x]])),
        color = "royalblue", fill = "skyblue"
      ) +
      scale_y_continuous(
        breaks = seq(0, 800000, by = 100000), labels = comma
      ) +
      xlab(x) +
      theme_bw()
    print(boxPlot)
  } else if (is.numeric(train_data[[x]])) {
    # Bin width is derived from the IQR of the non-zero, non-missing values.
    # The column is extracted by name with `[[` instead of `select(x)`, which
    # relied on deprecated use of an external vector, and the result is no
    # longer stored in a variable that shadows `stats::IQR`.
    values <- train_data[[x]]
    nonzero_iqr <- IQR(values[!is.na(values) & values != 0])
    if (!is.finite(nonzero_iqr)) {
      # No non-zero observations: fall back to the minimum width of 1.
      nonzero_iqr <- 0
    }
    width <- round(nonzero_iqr / 5, 0) + 1
    histogramPlot <- train_data |>
      ggplot(aes(x = .data[[x]])) +
      geom_histogram(
        na.rm = TRUE, binwidth = width, center = width / 2,
        color = "royalblue", fill = "skyblue"
      ) +
      theme_bw()
    print(histogramPlot)
    pointPlot <- train_data |>
      ggplot(aes(y = SalePrice)) +
      geom_point(
        aes(x = .data[[x]]), na.rm = TRUE, color = "royalblue"
      ) +
      scale_y_continuous(
        breaks = seq(0, 800000, by = 100000), labels = comma
      ) +
      theme_bw()
    print(pointPlot)
  }
}

# Distribution of SalePrice in bins of 10,000 with ticks every 100,000.
ggplot(train_data, aes(x = SalePrice)) +
  geom_histogram(
    binwidth = 10000,
    center = 5000,
    na.rm = TRUE,
    fill = "skyblue",
    color = "royalblue"
  ) +
  scale_x_continuous(
    labels = comma,
    breaks = seq(0, 800000, by = 100000)
  ) +
  theme_bw()

# Normal Q-Q plot of the raw SalePrice with its reference line.
ggplot(train_data, aes(sample = SalePrice)) +
  geom_qq(color = "slateblue") +
  geom_qq_line(linewidth = 1, color = "royalblue") +
  scale_y_continuous(
    labels = comma,
    breaks = seq(0, 800000, by = 100000)
  ) +
  theme_bw()

# Replace SalePrice with its base-10 logarithm, then redraw the normal Q-Q
# plot on the transformed scale.
train_data <- mutate(train_data, SalePrice = log10(SalePrice))
ggplot(train_data, aes(sample = SalePrice)) +
  geom_qq(color = "slateblue") +
  geom_qq_line(linewidth = 1, color = "royalblue") +
  theme_bw()

Test Correlation

# Test each predictor's association with SalePrice:
#   * factor columns            -> Kruskal-Wallis across the factor levels
#   * score columns (sequences) -> Kruskal-Wallis, values treated as groups
#   * other numeric columns     -> Kendall rank correlation
# The test calls are written out literally, with the loop variable named `x`,
# because the printed "data:" line is built from the argument expressions.
for (x in columns) {
  is_categorical <- is.factor(train_data[[x]])
  is_score <- !is_categorical && is.element(x, sequences)
  is_continuous <- !is_categorical && !is_score &&
    is.numeric(train_data[[x]])
  if (!(is_categorical || is_score || is_continuous)) {
    next
  }
  cat("x=", x, "\n")
  if (is_categorical) {
    print(kruskal.test(train_data$SalePrice, train_data[[x]]))
  } else if (is_score) {
    print(kruskal.test(train_data$SalePrice, as.factor(train_data[[x]])))
  } else {
    print(cor.test(
      train_data$SalePrice, train_data[[x]],
      method = "kendall"
    ))
  }
}
x= MSSubClass 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 531.54, df = 14, p-value < 2.2e-16

x= MSZoning 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 270.07, df = 4, p-value < 2.2e-16

x= LotFrontage 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 14.823, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.2903612 

x= LotArea 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 17.95, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.3141744 

x= Street 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 3.0624, df = 1, p-value = 0.08013

x= Alley 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 45.714, df = 2, p-value = 1.184e-10

x= LotShape 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 150.95, df = 3, p-value < 2.2e-16

x= LandContour 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.31, df = 3, p-value = 1.048e-07

x= Utilities 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 0.40737, df = 1, p-value = 0.5233

x= LotConfig 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.724, df = 4, p-value = 3.298e-07

x= LandSlope 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 3.9388, df = 2, p-value = 0.1395

x= Neighborhood 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 868.53, df = 24, p-value < 2.2e-16

x= Condition1 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 74.404, df = 8, p-value = 6.493e-13

x= Condition2 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 17.774, df = 7, p-value = 0.01303

x= BldgType 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 69.801, df = 4, p-value = 2.501e-14

x= HouseStyle 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 185.43, df = 7, p-value < 2.2e-16

x= OverallQual 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 966.1, df = 9, p-value < 2.2e-16

x= OverallCond 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 238.77, df = 8, p-value < 2.2e-16

x= YearBuilt 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = -26.57, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
       tau 
-0.4685664 

x= YearRemodAdd 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 519.19, df = 6, p-value < 2.2e-16

x= RoofStyle 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 39.878, df = 5, p-value = 1.581e-07

x= RoofMatl 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 18.872, df = 7, p-value = 0.008597

x= Exterior1st 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 304.04, df = 14, p-value < 2.2e-16

x= Exterior2nd 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 35.231, df = 15, p-value = 0.002279

x= MasVnrType 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 270.81, df = 3, p-value < 2.2e-16

x= MasVnrArea 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 16.404, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.3172199 

x= ExterQual 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 683.44, df = 3, p-value < 2.2e-16

x= ExterCond 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 46.617, df = 4, p-value = 1.832e-09

x= Foundation 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 488.72, df = 5, p-value < 2.2e-16

x= BsmtQual 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 674.41, df = 4, p-value < 2.2e-16

x= BsmtCond 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 120.88, df = 4, p-value < 2.2e-16

x= BsmtExposure 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 191.03, df = 4, p-value < 2.2e-16

x= BsmtFinType1 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 348.88, df = 6, p-value < 2.2e-16

x= BsmtFinSF1 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 12.164, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.2208792 

x= BsmtFinType2 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 64.589, df = 6, p-value = 5.232e-12

x= BsmtFinSF2 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = -1.476, p-value = 0.14
alternative hypothesis: true tau is not equal to 0
sample estimates:
        tau 
-0.03071045 

x= BsmtUnfSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 7.2624, p-value = 3.802e-13
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.1274574 

x= TotalBsmtSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 24.837, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.4350183 

x= Heating 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 33.267, df = 5, p-value = 3.331e-06

x= HeatingQC 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 356.78, df = 4, p-value < 2.2e-16

x= CentralAir 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 143.2, df = 1, p-value < 2.2e-16

x= Electrical 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 131.42, df = 4, p-value < 2.2e-16

x= X1stFlrSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 23.511, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.4115564 

x= X2ndFlrSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 12.105, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.2324971 

x= LowQualFinSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = -2.5943, p-value = 0.00948
alternative hypothesis: true tau is not equal to 0
sample estimates:
        tau 
-0.05530811 

x= GrLivArea 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 31.079, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.5439421 

x= BsmtFullBath 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 73.976, df = 3, p-value = 6.006e-16

x= BsmtHalfBath 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 0.23762, df = 2, p-value = 0.888

x= FullBath 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 599.36, df = 3, p-value < 2.2e-16

x= HalfBath 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 179.32, df = 2, p-value < 2.2e-16

x= BedroomAbvGr 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 101.09, df = 7, p-value < 2.2e-16

x= KitchenAbvGr 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 43.206, df = 3, p-value = 2.225e-09

x= KitchenQual 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 661.48, df = 3, p-value < 2.2e-16

x= TotRmsAbvGrd 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 419.41, df = 11, p-value < 2.2e-16

x= Functional 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 32.56, df = 6, p-value = 1.274e-05

x= Fireplaces 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 406.84, df = 3, p-value < 2.2e-16

x= FireplaceQu 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 461.9, df = 5, p-value < 2.2e-16

x= GarageType 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 532.7, df = 6, p-value < 2.2e-16

x= GarageYrBlt 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 615.92, df = 12, p-value < 2.2e-16

x= GarageFinish 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 609.81, df = 3, p-value < 2.2e-16

x= GarageCars 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 702.51, df = 4, p-value < 2.2e-16

x= GarageArea 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 27.204, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.4781465 

x= GarageQual 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 192, df = 5, p-value < 2.2e-16

x= GarageCond 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 190.96, df = 5, p-value < 2.2e-16

x= PavedDrive 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 115.84, df = 2, p-value < 2.2e-16

x= WoodDeckSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 13.684, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
      tau 
0.2603486 

x= OpenPorchSF 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 18.724, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
     tau 
0.350161 

x= EnclosedPorch 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = -8.3273, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
       tau 
-0.1720941 

x= X3SsnPorch 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 2.5075, p-value = 0.01216
alternative hypothesis: true tau is not equal to 0
sample estimates:
       tau 
0.05347771 

x= ScreenPorch 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 3.8416, p-value = 0.0001222
alternative hypothesis: true tau is not equal to 0
sample estimates:
       tau 
0.08065437 

x= PoolArea 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = 2.2349, p-value = 0.02542
alternative hypothesis: true tau is not equal to 0
sample estimates:
       tau 
0.04780012 

x= PoolQC 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 5.9957, df = 3, p-value = 0.1118

x= Fence 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and as.factor(train_data[[x]])
Kruskal-Wallis chi-squared = 78.023, df = 4, p-value = 4.567e-16

x= MiscFeature 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 10.654, df = 4, p-value = 0.03075

x= MiscVal 

    Kendall's rank correlation tau

data:  train_data$SalePrice and train_data[[x]]
z = -2.3973, p-value = 0.01652
alternative hypothesis: true tau is not equal to 0
sample estimates:
        tau 
-0.05091716 

x= MoSold 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 14.278, df = 11, p-value = 0.218

x= YrSold 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 1.6459, df = 4, p-value = 0.8005

x= SaleType 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 156.38, df = 8, p-value < 2.2e-16

x= SaleCondition 

    Kruskal-Wallis rank sum test

data:  train_data$SalePrice and train_data[[x]]
Kruskal-Wallis chi-squared = 168.32, df = 5, p-value < 2.2e-16
# Predictors removed before modelling. In the tests printed above,
# BsmtHalfBath, PoolQC, MoSold and YrSold show no significant association
# with SalePrice (p > 0.05); the remaining columns were presumably screened
# out the same way earlier in the report.
# The same list is applied to both data sets so their columns stay aligned.
dropped_predictors <- c(
  "Street", "Utilities", "LandSlope", "BsmtFinSF2",
  "BsmtHalfBath", "PoolQC", "MoSold", "YrSold"
)
train_data <- select(train_data, -all_of(dropped_predictors))
test_data <- select(test_data, -all_of(dropped_predictors))

Split Data

# Fix the RNG seed so the hold-out split and the CV folds are reproducible.
set.seed(1024)

# Hold out part of the labelled data for the final validation step
# (initial_split() defaults are used).
split_data <- initial_split(train_data)

# Build the cross-validation folds from the training portion ONLY.
# Resampling the full `train_data` would let the hold-out rows take part in
# hyperparameter tuning, so the later `last_fit(split_data)` score would be
# computed on data the tuner has already seen (optimistic estimate).
fold_data <- vfold_cv(training(split_data))

Create Model

# Gradient-boosted tree regression spec (xgboost engine).
# `mtry` and `min_n` are left as tune() placeholders; the number of trees,
# tree depth and learning rate are fixed.
# `counts = FALSE` pairs with the `mtry_prop()` range used in the tuning
# step, i.e. mtry is supplied as a proportion rather than a column count.
model <- boost_tree(
  mode = "regression",
  mtry = tune(),
  min_n = tune(),
  trees = 1000,
  tree_depth = 10,
  learn_rate = 0.01
) |>
  set_engine("xgboost", nthread = 8, counts = FALSE)

Create Recipe

# Preprocessing recipe for the boosted-tree model.
#
# The template data is the training portion of the split, passed explicitly
# as a data frame: `recipe()` documents `data` as a data frame/tibble, and
# handing it the raw `rsplit` object only works through implicit coercion.
#
# Steps:
#   1. KNN-impute groups of related columns, each group using only the
#      predictors listed in `impute_with` to find neighbours.
#   2. One-hot encode every nominal predictor.
recipe <- recipe(SalePrice ~ ., data = training(split_data)) |>
  # Lot / zoning / electrical info, from dwelling class and neighbourhood.
  step_impute_knn(
    MSZoning, LotFrontage, Electrical,
    impute_with = imp_vars(
      MSSubClass, Neighborhood
    )
  ) |>
  # Exterior covering and masonry veneer type, from other exterior columns.
  step_impute_knn(
    Exterior1st, Exterior2nd, MasVnrType,
    impute_with = imp_vars(
      MasVnrArea, ExterQual, ExterCond
    )
  ) |>
  # Basement descriptors, from the basement finish type and area columns.
  step_impute_knn(
    BsmtQual, BsmtCond, BsmtExposure, BsmtFinType2, BsmtFullBath,
    impute_with = imp_vars(
      BsmtFinType1, BsmtFinSF1, BsmtUnfSF, TotalBsmtSF
    )
  ) |>
  # Garage descriptors, from the garage type.
  step_impute_knn(
    GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond,
    impute_with = imp_vars(GarageType)
  ) |>
  # Kitchen quality, from the number of kitchens.
  step_impute_knn(
    KitchenQual,
    impute_with = imp_vars(KitchenAbvGr)
  ) |>
  # Sale type, from the sale condition.
  step_impute_knn(
    SaleType,
    impute_with = imp_vars(SaleCondition)
  ) |>
  # Full one-hot encoding (one indicator per level, no reference level).
  step_dummy(
    all_nominal_predictors(),
    one_hot = TRUE
  )

Create Workflow

# Bundle the preprocessing recipe and the model spec into one workflow so
# they are tuned, fitted and used for prediction together.
workflow <- workflow(preprocessor = recipe, spec = model)

Tune Model

# Bayesian optimisation of the two tune() placeholders over the CV folds.
# `mtry` is searched as a proportion (mtry_prop()) and `min_n` over its
# default range; both RMSE and R-squared are recorded for every candidate.
tuning_space <- parameters(
  list(min_n = min_n(), mtry = mtry_prop())
)
tune <- tune_bayes(
  workflow,
  resamples = fold_data,
  param_info = tuning_space,
  metrics = metric_set(rmse, rsq)
)

# Plot the tuning results.
autoplot(tune)

# Top candidates ranked by cross-validated RMSE.
show_best(tune, metric = "rmse")
# A tibble: 5 × 9
  min_n  mtry .metric .estimator   mean     n std_err .config .iter
  <int> <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>   <int>
1     6 0.100 rmse    standard   0.0519    10 0.00257 Iter4       4
2     5 0.107 rmse    standard   0.0520    10 0.00262 Iter6       6
3     5 0.102 rmse    standard   0.0521    10 0.00256 Iter7       7
4     2 0.101 rmse    standard   0.0522    10 0.00252 Iter2       2
5     6 0.117 rmse    standard   0.0524    10 0.00257 Iter9       9
# Top candidates ranked by cross-validated R-squared.
show_best(tune, metric = "rsq")
# A tibble: 5 × 9
  min_n  mtry .metric .estimator  mean     n std_err .config .iter
  <int> <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>   <int>
1     6 0.100 rsq     standard   0.912    10 0.00698 Iter4       4
2     5 0.107 rsq     standard   0.912    10 0.00758 Iter6       6
3     2 0.101 rsq     standard   0.912    10 0.00735 Iter2       2
4     5 0.102 rsq     standard   0.912    10 0.00739 Iter7       7
5     6 0.117 rsq     standard   0.910    10 0.00731 Iter9       9
# Take the candidate with the best RMSE and plug its values into the
# workflow's tune() placeholders.
params <- select_best(tune, metric = "rmse")
final <- finalize_workflow(workflow, params)

Validate Model

# Fit the finalized workflow on the training portion of the split and
# report its metrics on the held-out portion.
validation_fit <- last_fit(final, split = split_data)
collect_metrics(validation_fit)
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      0.0463 Preprocessor1_Model1
2 rsq     standard      0.931  Preprocessor1_Model1

Test Model

# Refit the finalized workflow on all labelled rows.
fit <- fit(final, train_data)

# Predict the unlabelled rows and undo the base-10 log scale.
# NOTE(review): the 10^ back-transform implies SalePrice was log10-transformed
# upstream (outside this section) - confirm.
# Precedence note: `^` binds tighter than `|>`, so the original
# `10^SalePrice |> round(0)` is exactly round(10^SalePrice, 0).
predict <- predict(fit, test_data) |>
  mutate(SalePrice = round(10^.pred, 0), .keep = "none")

# Pair every prediction with its house Id.
submission <- bind_cols(select(test_id, Id), predict)
glimpse(submission)
Rows: 1,459
Columns: 2
$ Id        <int> 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, …
$ SalePrice <dbl> 126833, 158187, 184164, 192850, 188031, 174289, 170760, 1629…

Score

Boost Tree
Back to top